{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "cde8a9dc",
   "metadata": {},
   "source": [
    "# PDF parsing evaluation\n",
    "\n",
    "## Step 1: Create our dataset and download the PDFs\n",
    "\n",
    "Let's download a few arXiv PDF papers and create a dataset with them."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "82b32983",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading PDFs...\n",
      "2505.19443.pdf already downloaded\n",
      "2506.23253.pdf already downloaded\n",
      "2506.11162v1.pdf already downloaded\n",
      "2507.00951v1.pdf already downloaded\n",
      "2505.17810.pdf already downloaded\n",
      "2407.12787v2.pdf already downloaded\n",
      "2411.10867.pdf already downloaded\n",
      "2410.12851v1.pdf already downloaded\n",
      "Early_2025_AI_Experienced_OS_Devs_Study.pdf already downloaded\n",
      "Done\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pdf</th>\n",
       "      <th>title</th>\n",
       "      <th>author_names</th>\n",
       "      <th>github_link</th>\n",
       "      <th>file</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>https://arxiv.org/pdf/2505.19443</td>\n",
       "      <td>Vibe Coding vs. Agentic Coding: Fundamentals a...</td>\n",
       "      <td>Ranjan Sapkota, Konstantinos I. Roumeliotis, M...</td>\n",
       "      <td></td>\n",
       "      <td>.files/2505.19443.pdf</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>https://arxiv.org/pdf/2506.23253</td>\n",
       "      <td>Vibe coding: programming through conversation ...</td>\n",
       "      <td>Advait Sarkar, Ian Drosos</td>\n",
       "      <td></td>\n",
       "      <td>.files/2506.23253.pdf</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>https://arxiv.org/pdf/2506.11162v1</td>\n",
       "      <td>VIBE: Can a VLM Read the Room?*</td>\n",
       "      <td>Tania Chakraborty, Eylon Caplan, Dan Goldwasser</td>\n",
       "      <td></td>\n",
       "      <td>.files/2506.11162v1.pdf</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>https://arxiv.org/pdf/2507.00951v1</td>\n",
       "      <td>Thinking Beyond Tokens: From Brain-Inspired In...</td>\n",
       "      <td>Rizwan Qureshi, Ranjan Sapkota, Abbas Shah, Am...</td>\n",
       "      <td></td>\n",
       "      <td>.files/2507.00951v1.pdf</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>https://arxiv.org/pdf/2505.17810</td>\n",
       "      <td>VIBE: Vector Index Benchmark for Embeddings</td>\n",
       "      <td>Elias Jääsaari, Ville Hyvönen, Matteo Ceccarel...</td>\n",
       "      <td>https://github.com/vector-index-bench/vibe</td>\n",
       "      <td>.files/2505.17810.pdf</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>https://arxiv.org/pdf/2407.12787v2</td>\n",
       "      <td>GameVibe: a multimodal affective game corpus</td>\n",
       "      <td>Matthew Barthet, Maria Kaselimi, Kosmas Pinita...</td>\n",
       "      <td></td>\n",
       "      <td>.files/2407.12787v2.pdf</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>https://arxiv.org/pdf/2411.10867</td>\n",
       "      <td>ViBe: A Text-to-Video Benchmark for Evaluating...</td>\n",
       "      <td>Vipula Rawte, Sarthak Jain, Aarush Sinha, Garv...</td>\n",
       "      <td>https://vibe-t2v-bench.github.io/</td>\n",
       "      <td>.files/2411.10867.pdf</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>https://arxiv.org/pdf/2410.12851v1</td>\n",
       "      <td>VibeCheck: Discover &amp; Quantify Qualitative Dif...</td>\n",
       "      <td>Lisa Dunlap, Krishna Mandal, Trevor Darrell, J...</td>\n",
       "      <td></td>\n",
       "      <td>.files/2410.12851v1.pdf</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>https://metr.org/Early_2025_AI_Experienced_OS_...</td>\n",
       "      <td>Measuring the Impact of Early-2025 AI on Exper...</td>\n",
       "      <td>Joel Becker, Nate Rush, Beth Barnes, David Rein</td>\n",
       "      <td></td>\n",
       "      <td>.files/Early_2025_AI_Experienced_OS_Devs_Study...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                 pdf  \\\n",
       "0                   https://arxiv.org/pdf/2505.19443   \n",
       "1                   https://arxiv.org/pdf/2506.23253   \n",
       "2                 https://arxiv.org/pdf/2506.11162v1   \n",
       "3                 https://arxiv.org/pdf/2507.00951v1   \n",
       "4                   https://arxiv.org/pdf/2505.17810   \n",
       "5                 https://arxiv.org/pdf/2407.12787v2   \n",
       "6                   https://arxiv.org/pdf/2411.10867   \n",
       "7                 https://arxiv.org/pdf/2410.12851v1   \n",
       "8  https://metr.org/Early_2025_AI_Experienced_OS_...   \n",
       "\n",
       "                                               title  \\\n",
       "0  Vibe Coding vs. Agentic Coding: Fundamentals a...   \n",
       "1  Vibe coding: programming through conversation ...   \n",
       "2                    VIBE: Can a VLM Read the Room?*   \n",
       "3  Thinking Beyond Tokens: From Brain-Inspired In...   \n",
       "4        VIBE: Vector Index Benchmark for Embeddings   \n",
       "5       GameVibe: a multimodal affective game corpus   \n",
       "6  ViBe: A Text-to-Video Benchmark for Evaluating...   \n",
       "7  VibeCheck: Discover & Quantify Qualitative Dif...   \n",
       "8  Measuring the Impact of Early-2025 AI on Exper...   \n",
       "\n",
       "                                        author_names  \\\n",
       "0  Ranjan Sapkota, Konstantinos I. Roumeliotis, M...   \n",
       "1                          Advait Sarkar, Ian Drosos   \n",
       "2    Tania Chakraborty, Eylon Caplan, Dan Goldwasser   \n",
       "3  Rizwan Qureshi, Ranjan Sapkota, Abbas Shah, Am...   \n",
       "4  Elias Jääsaari, Ville Hyvönen, Matteo Ceccarel...   \n",
       "5  Matthew Barthet, Maria Kaselimi, Kosmas Pinita...   \n",
       "6  Vipula Rawte, Sarthak Jain, Aarush Sinha, Garv...   \n",
       "7  Lisa Dunlap, Krishna Mandal, Trevor Darrell, J...   \n",
       "8    Joel Becker, Nate Rush, Beth Barnes, David Rein   \n",
       "\n",
       "                                  github_link  \\\n",
       "0                                               \n",
       "1                                               \n",
       "2                                               \n",
       "3                                               \n",
       "4  https://github.com/vector-index-bench/vibe   \n",
       "5                                               \n",
       "6           https://vibe-t2v-bench.github.io/   \n",
       "7                                               \n",
       "8                                               \n",
       "\n",
       "                                                file  \n",
       "0                              .files/2505.19443.pdf  \n",
       "1                              .files/2506.23253.pdf  \n",
       "2                            .files/2506.11162v1.pdf  \n",
       "3                            .files/2507.00951v1.pdf  \n",
       "4                              .files/2505.17810.pdf  \n",
       "5                            .files/2407.12787v2.pdf  \n",
       "6                              .files/2411.10867.pdf  \n",
       "7                            .files/2410.12851v1.pdf  \n",
       "8  .files/Early_2025_AI_Experienced_OS_Devs_Study...  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import httpx\n",
    "import pandas as pd\n",
    "import os\n",
    "\n",
    "# Ground-truth metadata for the papers used in this evaluation.\n",
    "# Each record is (pdf, title, author_names, github_link); github_link is an\n",
    "# empty string for the papers where no link is listed here.\n",
    "df = pd.DataFrame(\n",
    "    data=[\n",
    "        (\n",
    "            \"https://arxiv.org/pdf/2505.19443\",\n",
    "            \"Vibe Coding vs. Agentic Coding: Fundamentals and Practical Implications of Agentic AI\",\n",
    "            \"Ranjan Sapkota, Konstantinos I. Roumeliotis, Manoj Karkee\",\n",
    "            \"\",\n",
    "        ),\n",
    "        (\n",
    "            \"https://arxiv.org/pdf/2506.23253\",\n",
    "            \"Vibe coding: programming through conversation with artificial intelligence\",\n",
    "            \"Advait Sarkar, Ian Drosos\",\n",
    "            \"\",\n",
    "        ),\n",
    "        (\n",
    "            \"https://arxiv.org/pdf/2506.11162v1\",\n",
    "            \"VIBE: Can a VLM Read the Room?*\",\n",
    "            \"Tania Chakraborty, Eylon Caplan, Dan Goldwasser\",\n",
    "            \"\",\n",
    "        ),\n",
    "        (\n",
    "            \"https://arxiv.org/pdf/2507.00951v1\",\n",
    "            \"Thinking Beyond Tokens: From Brain-Inspired Intelligence to Cognitive Foundations for Artificial General Intelligence and its Societal Impact\",\n",
    "            \"Rizwan Qureshi, Ranjan Sapkota, Abbas Shah, Amgad Muneer, Anas Zafar, Ashmal Vayani, Maged Shoman, Abdelrahman B. M. Eldaly, Kai Zhang, Ferhat Sadak, Shaina Raza, Xinqi Fan, Ravid Shwartz-Ziv, Hong Yan, Vinjia Jain, Aman Chadha, Manoj Karkee, Jia Wu, Philip Torr, Seyedali Mirjalili\",\n",
    "            \"\",\n",
    "        ),\n",
    "        (\n",
    "            \"https://arxiv.org/pdf/2505.17810\",\n",
    "            \"VIBE: Vector Index Benchmark for Embeddings\",\n",
    "            \"Elias Jääsaari, Ville Hyvönen, Matteo Ceccarello, Teemu Roos, Martin Aumüller\",\n",
    "            \"https://github.com/vector-index-bench/vibe\",\n",
    "        ),\n",
    "        (\n",
    "            \"https://arxiv.org/pdf/2407.12787v2\",\n",
    "            \"GameVibe: a multimodal affective game corpus\",\n",
    "            \"Matthew Barthet, Maria Kaselimi, Kosmas Pinitas, Konstantinos Makantasis, Antonios Liapis, Georgios N. Yannakakis\",\n",
    "            \"\",\n",
    "        ),\n",
    "        (\n",
    "            \"https://arxiv.org/pdf/2411.10867\",\n",
    "            \"ViBe: A Text-to-Video Benchmark for Evaluating Hallucination in Large Multimodal Models\",\n",
    "            \"Vipula Rawte, Sarthak Jain, Aarush Sinha, Garv Kaushik, Aman Bansal, Prathiksha Rumale Vishwanath, Samyak Rajesh Jain, Aishwarya Naresh Reganti, Vinija Jain, Aman Chadha, Amit Sheth, Amitava Das\",\n",
    "            \"https://vibe-t2v-bench.github.io/\",\n",
    "        ),\n",
    "        (\n",
    "            \"https://arxiv.org/pdf/2410.12851v1\",\n",
    "            \"VibeCheck: Discover & Quantify Qualitative Differences in Large Language Models\",\n",
    "            \"Lisa Dunlap, Krishna Mandal, Trevor Darrell, Jacob Steinhardt, Joseph Gonzalez\",\n",
    "            \"\",\n",
    "        ),\n",
    "        (\n",
    "            \"https://metr.org/Early_2025_AI_Experienced_OS_Devs_Study.pdf\",\n",
    "            \"Measuring the Impact of Early-2025 AI on Experienced Open-Source Developer Productivity\",\n",
    "            \"Joel Becker, Nate Rush, Beth Barnes, David Rein\",\n",
    "            \"\",\n",
    "        ),\n",
    "    ],\n",
    "    columns=[\"pdf\", \"title\", \"author_names\", \"github_link\"],\n",
    ")\n",
    "\n",
    "# Download every PDF into .files/ (skipping files that already exist) and\n",
    "# record the local path of each one in a new \"file\" column.\n",
    "print(\"Downloading PDFs...\")\n",
    "os.makedirs(\".files/\", exist_ok=True)\n",
    "for index, row in df.iterrows():\n",
    "    # Use the last URL segment as the file name, ending in exactly one \".pdf\".\n",
    "    name = row[\"pdf\"].split(\"/\")[-1].replace(\".pdf\", \"\") + \".pdf\"\n",
    "    path = f\".files/{name}\"\n",
    "    if os.path.exists(path):\n",
    "        print(f\"{name} already downloaded\")\n",
    "    else:\n",
    "        # httpx does not follow redirects unless asked to, so request it\n",
    "        # explicitly; the longer timeout leaves room for large PDFs.\n",
    "        response = httpx.get(row[\"pdf\"], follow_redirects=True, timeout=60.0)\n",
    "        # Fail loudly on an HTTP error instead of saving the error page as a\n",
    "        # \".pdf\", which the existence check above would then treat as cached.\n",
    "        response.raise_for_status()\n",
    "        with open(path, \"wb\") as f:\n",
    "            f.write(response.content)\n",
    "        print(f\"Downloaded {name}\")\n",
    "    df.at[index, \"file\"] = path\n",
    "print(\"Done\")\n",
    "\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "60b0eec2",
   "metadata": {},
   "source": [
    "## Step 2: Define our unstructured data parsing pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "d42a902b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2025-07-11 12:25:50 - Loaded .env file\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Prediction(\n",
       "    title='Vibe Coding vs. Agentic Coding: Fundamentals and Practical Implications of Agentic AI',\n",
       "    author_names='Ranjan Sapkota, Konstantinos I. Roumeliotis, Manoj Karkee',\n",
       "    github_link=None\n",
       ")"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import dspy\n",
    "import langwatch\n",
    "from unstructured.partition.pdf import partition_pdf\n",
    "from unstructured.staging.base import elements_to_text\n",
    "\n",
    "# Use gpt-4o-mini as the default language model for every DSPy call below.\n",
    "dspy.configure(lm=dspy.LM(\"openai/gpt-4o-mini\"))\n",
    "\n",
    "\n",
    "@langwatch.trace()\n",
    "def extract_pdf_info(filename):\n",
    "    \"\"\"Extract the title, author names and GitHub link from a PDF file.\n",
    "\n",
    "    The PDF is partitioned with `unstructured`, the resulting elements are\n",
    "    flattened to plain text, and that text is passed to a DSPy predictor.\n",
    "\n",
    "    Args:\n",
    "        filename: Path of the PDF file to parse.\n",
    "\n",
    "    Returns:\n",
    "        The DSPy prediction, exposing `title`, `author_names` and\n",
    "        `github_link`.\n",
    "    \"\"\"\n",
    "    # Enable DSPy auto-tracking on the trace opened by the decorator.\n",
    "    current_trace = langwatch.get_current_trace()\n",
    "    current_trace.autotrack_dspy()\n",
    "\n",
    "    pdf_text = elements_to_text(elements=partition_pdf(filename=filename))\n",
    "\n",
    "    extractor = dspy.Predict(\n",
    "        \"pdf -> title: str, author_names: str, github_link: Optional[str]\"\n",
    "    )\n",
    "    return extractor(pdf=pdf_text)\n",
    "\n",
    "\n",
    "# Smoke test: run the pipeline on the first downloaded paper.\n",
    "extract_pdf_info(df[\"file\"][0])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ab717578",
   "metadata": {},
   "source": [
    "## Step 3: Run the evaluation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "787aeb49",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2025-07-11 12:26:58 - HTTP Request: POST https://app.langwatch.ai/api/experiment/init \"HTTP/1.1 200 OK\"\n",
      "Follow the results at: https://app.langwatch.ai/demo/experiments/pdf-parsing-evaluation?runId=stirring-lemming-of-maturity\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "6b84c30fb43146f8afefbd3bb15276c1",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Evaluating:   0%|          | 0/9 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2025-07-11 12:27:03 - HTTP Request: POST https://app.langwatch.ai/api/evaluations/batch/log_results \"HTTP/1.1 200 OK\"\n",
      "2025-07-11 12:27:05 - HTTP Request: POST https://app.langwatch.ai/api/evaluations/batch/log_results \"HTTP/1.1 200 OK\"\n",
      "2025-07-11 12:27:07 - HTTP Request: POST https://app.langwatch.ai/api/evaluations/batch/log_results \"HTTP/1.1 200 OK\"\n",
      "2025-07-11 12:27:08 - Cannot set gray non-stroke color because /'p26' is an invalid float value\n",
      "2025-07-11 12:27:09 - HTTP Request: POST https://app.langwatch.ai/api/evaluations/batch/log_results \"HTTP/1.1 200 OK\"\n",
      "2025-07-11 12:27:11 - HTTP Request: POST https://app.langwatch.ai/api/evaluations/batch/log_results \"HTTP/1.1 200 OK\"\n",
      "2025-07-11 12:27:15 - HTTP Request: POST https://app.langwatch.ai/api/evaluations/batch/log_results \"HTTP/1.1 200 OK\"\n",
      "2025-07-11 12:27:16 - HTTP Request: POST https://app.langwatch.ai/api/evaluations/batch/log_results \"HTTP/1.1 200 OK\"\n",
      "2025-07-11 12:27:16 - HTTP Request: POST https://app.langwatch.ai/api/evaluations/batch/log_results \"HTTP/1.1 200 OK\"\n"
     ]
    }
   ],
   "source": [
    "# Start (or resume) the LangWatch experiment that collects the results.\n",
    "evaluation = langwatch.evaluation.init(\"pdf-parsing-evaluation\")\n",
    "\n",
    "for index, row in evaluation.loop(df.iterrows()):\n",
    "\n",
    "    def evaluate(index, row):\n",
    "        \"\"\"Parse one PDF and log whether the extracted authors match exactly.\"\"\"\n",
    "        response = extract_pdf_info(row[\"file\"])\n",
    "\n",
    "        # Exact string comparison against the ground-truth author list.\n",
    "        evaluation.log(\n",
    "            \"author_names_accuracy\",\n",
    "            index=index,\n",
    "            passed=response.author_names == row[\"author_names\"],\n",
    "        )\n",
    "\n",
    "        return response\n",
    "\n",
    "    # Hand the per-row function to the evaluation for this dataset entry.\n",
    "    evaluation.submit(evaluate, index, row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c9fa8b21",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
